library(readr)
library(tuber)
#directory <- "data/"
#
#terms <- c(
# "R Programming",
# "Beginner R Programming",
# "Intermediate R Programming",
# "Advanced R Programming"
#)
#
#numTerms <- length(terms)
#
## Grab general info from search endpoint for each term
#for (x in 1:numTerms) {
# term <- terms[x]
# file <- paste(term, ".csv", sep = "")
#
# filePath <- paste(directory, file, sep = "")
#
# videos <- yt_search(term)
# videos <- videos[, c(1,2,3,4,5,6,15)]
# # [1] "video_id" "publishedAt" "channelId"
# # [4] "title" "description" "thumbnails.default.url"
# # [7] "thumbnails.default.width" "thumbnails.default.height" "thumbnails.medium.url"
# #[10] "thumbnails.medium.width" "thumbnails.medium.height" "thumbnails.high.url"
# #[13] "thumbnails.high.width" "thumbnails.high.height" "channelTitle"
# #[16] "liveBroadcastContent" "publishTime"
#
# videos <- videos %>%
# add_column(viewCount = NA,
# likeCount = NA,
# dislikeCount = NA,
# favoriteCount = NA,
# commentCount = NA,
# tags = NA)
#
# videos
# write_csv(videos, filePath)
#}
library(readr)
library(tuber)
library(dplyr)
library(lubridate)
#directory <- "data/"
#
#files <- c(
# "R Programming.csv",
# "Beginner R Programming.csv",
# "Intermediate R Programming.csv",
# "Advanced R Programming.csv"
#)
#
#numFiles <- length(files)
#
## Get details for top 100 results per search (sorted by relevancy)
#for (x in 1:numFiles) {
# file <- files[x]
# filePath <- paste(directory, file, sep = "")
#
# videos <- read_csv(filePath)
#
# videos <- videos %>%
# add_column(viewCount = NA,
# likeCount = NA,
# dislikeCount = NA,
# favoriteCount = NA,
# commentCount = NA,
# tags = NA)
#
# numVideos <- nrow(videos)
# maxRelevant <- 100
#
# for (y in 1:maxRelevant) {
# videoId <- videos[[y,1]]
# channelId <- videos[[y,3]]
#
#
#
# stats <- get_stats(videoId)
# videos[y, 8] <- if ("viewCount" %in% names(stats) && !is.null(stats[["viewCount"]])) as.double(stats$viewCount) else 0
# videos[y, 9] <- if ("likeCount" %in% names(stats) && !is.null(stats[["likeCount"]])) as.double(stats$likeCount) else 0
# videos[y, 10] <- if ("dislikeCount" %in% names(stats) && !is.null(stats[["dislikeCount"]])) as.double(stats$dislikeCount) else 0
# videos[y, 11] <- if ("favoriteCount" %in% names(stats) && !is.null(stats[["favoriteCount"]])) as.double(stats$favoriteCount) else 0
# videos[y, 12] <- if ("commentCount" %in% names(stats) && !is.null(stats[["commentCount"]])) as.double(stats$commentCount) else 0
#
#
#
# details <- get_video_details(videoId)
# items <- details$items[[1]]
# snippet <- items$snippet
#
# tags <- if ("tags" %in% names(snippet) && !is.null(snippet[["tags"]])) snippet$tags else c()
# numTags <- length(tags)
# tagConcat <- ""
#
# for (z in 1:numTags) {
# tagConcat <- paste(tagConcat, tags[[z]], sep = if (z == 1) "" else ",")
# }
#
# videos[y, 2] <- round_date(videos[y, 2], "day")
# videos[y, 5] <- if ("description" %in% names(snippet) && !is.null(snippet[["description"]])) snippet$description else ""
# videos[y, 13] <- tagConcat
#
#
#
# #captions <- get_captions(videoId)
#
#
#
# #comments <- get_comment_threads(c(video_id = videoId))
# }
#}
#
#videos
#write_csv(videos, filePath)
library(readr)
library(dplyr)
library(lubridate)
library(stringr)
# Folder holding the per-search CSV files, and the path of each file.
directory <- "data/"
rFilePath <- paste0(directory, "R Programming.csv")
beginnerFilePath <- paste0(directory, "Beginner R Programming.csv")
intermediateFilePath <- paste0(directory, "Intermediate R Programming.csv")
advancedFilePath <- paste0(directory, "Advanced R Programming.csv")
# Results of the generic "R Programming" search.
rVideos <- read_csv(file = rFilePath)
## Rows: 594 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): video_id, channelId, title, description, thumbnails.default.url, c...
## dbl (5): viewCount, likeCount, dislikeCount, favoriteCount, commentCount
## dttm (1): publishedAt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Results of the "Beginner R Programming" search.
beginnerVideos <- read_csv(file = beginnerFilePath)
## Rows: 597 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): video_id, channelId, title, description, thumbnails.default.url, c...
## dbl (5): viewCount, likeCount, dislikeCount, favoriteCount, commentCount
## dttm (1): publishedAt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Results of the "Intermediate R Programming" search.
intermediateVideos <- read_csv(file = intermediateFilePath)
## Rows: 566 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): video_id, channelId, title, description, thumbnails.default.url, c...
## dbl (5): viewCount, likeCount, dislikeCount, favoriteCount, commentCount
## dttm (1): publishedAt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Results of the "Advanced R Programming" search.
advancedVideos <- read_csv(file = advancedFilePath)
## Rows: 601 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): video_id, channelId, title, description, thumbnails.default.url, c...
## dbl (5): viewCount, likeCount, dislikeCount, favoriteCount, commentCount
## dttm (1): publishedAt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of leading rows of each search result that are analysed.
maxRelevant <- 100

# Keep the first `n` rows of `videos` and drop every video that also appears
# in any table of `others`.
#
# Videos are matched on `video_id` rather than on whole rows: a row-wise
# setdiff() only removes a duplicate when every column is identical, so the
# same video with a different view count, description or timestamp would
# slip through. head() is used instead of `[1:n, ]` so that a table with
# fewer than `n` rows does not gain all-NA rows.
#
# videos: tibble with a `video_id` column.
# others: list of tibbles, each with a `video_id` column.
# n:      number of leading rows of `videos` to consider.
# Returns the remaining rows of `videos`, with all original columns.
keepUniqueTopVideos <- function(videos, others, n = maxRelevant) {
  topVideos <- videos %>%
    head(n) %>%
    distinct(video_id, .keep_all = TRUE)
  for (other in others) {
    topVideos <- anti_join(topVideos, other, by = "video_id")
  }
  topVideos
}

# Dedupe records from generic search and other targeted searches for good measure
beginnerVideosUnique <- keepUniqueTopVideos(
  beginnerVideos,
  list(rVideos, intermediateVideos, advancedVideos)
)
## Warning: One or more parsing issues, see `problems()` for details
## Warning: One or more parsing issues, see `problems()` for details
## Warning: One or more parsing issues, see `problems()` for details
## Warning: One or more parsing issues, see `problems()` for details
intermediateVideosUnique <- keepUniqueTopVideos(
  intermediateVideos,
  list(rVideos, beginnerVideos, advancedVideos)
)
advancedVideosUnique <- keepUniqueTopVideos(
  advancedVideos,
  list(rVideos, beginnerVideos, intermediateVideos)
)
# Label each deduplicated table with its difficulty level and append two
# placeholder columns (`rVersion`, `relevant`) that are filled in later.
beginnerVideosUnique <- mutate(
  beginnerVideosUnique,
  level = "beginner",
  rVersion = NA,
  relevant = NA
)
intermediateVideosUnique <- mutate(
  intermediateVideosUnique,
  level = "intermediate",
  rVersion = NA,
  relevant = NA
)
advancedVideosUnique <- mutate(
  advancedVideosUnique,
  level = "advanced",
  rVersion = NA,
  relevant = NA
)
# TRUE where `text` contains a standalone "R" (case-insensitive).
# A missing value counts as "no match": str_detect() returns NA for NA input,
# and `FALSE | NA` is NA, which previously left `relevant` undefined for
# videos without tags or description.
mentionsR <- function(text) {
  found <- str_detect(text, regex("\\bR\\b", ignore_case = TRUE))
  !is.na(found) & found
}

# Round `publishedAt` to the nearest day and flag a video as relevant when a
# standalone "R" appears in its title, tags, or description.
#
# videos: tibble with `publishedAt`, `title`, `tags` and `description`.
# Returns `videos` with `publishedAt` rounded and `relevant` set to TRUE/FALSE.
flagRelevance <- function(videos) {
  videos %>%
    mutate(
      publishedAt = round_date(publishedAt, "day"),
      relevant = mentionsR(title) | mentionsR(tags) | mentionsR(description)
    )
}

beginnerVideosUnique <- flagRelevance(beginnerVideosUnique)
intermediateVideosUnique <- flagRelevance(intermediateVideosUnique)
advancedVideosUnique <- flagRelevance(advancedVideosUnique)
# Table of R releases; the delimiter is left for read_delim() to detect and
# surrounding whitespace is trimmed from every field.
rVersionFile <- "R Versions.csv"
rVersionsPath <- paste0(directory, rVersionFile)
rVersions <- read_delim(file = rVersionsPath, trim_ws = TRUE)
## New names:
## * `` -> ...1
## Rows: 89 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Derive `dateReleased` from the `date` column, whose values are split on ","
# into a month name and a year. The release is dated to the first day of
# that month.
dateSplits <- str_split(rVersions$date, ",")
numDateSplits <- length(dateSplits)
# Missing pieces become NA instead of raising a subscript error.
releaseMonth <- match(
  str_trim(vapply(dateSplits, `[`, character(1), 1)),
  month.name
)
releaseYear <- str_trim(vapply(dateSplits, `[`, character(1), 2))

# mutate() (rather than add_column()) so the step can be re-run: the file is
# written back over its own source below, so on a second run `dateReleased`
# already exists and add_column() would abort.
# The time zone is pinned to UTC so the value does not depend on the machine's
# local time zone when it is compared with timestamps or written to disk.
rVersions <- rVersions %>%
  mutate(dateReleased = as.POSIXct(
    paste(releaseYear, releaseMonth, "01", sep = "-"),
    format = "%Y-%m-%d",
    tz = "UTC"
  ))
write_csv(rVersions, paste0(directory, rVersionFile))
# Check R version available at time of video release
#
# For every publish date, return the name (first column of `versions`) of the
# first row of `versions` whose `dateReleased` is on or before that date.
# NOTE(review): this yields the most recent release available at publish time
# only if `versions` is ordered newest first - confirm against the CSV.
#
# publishedAt: date-time vector of video publish dates.
# versions:    tibble whose first column holds the version name and which
#              has a `dateReleased` date-time column.
# Returns a vector of version names, NA where no release precedes the video
# or the publish date is missing.
findRVersion <- function(publishedAt, versions) {
  released <- as.numeric(versions$dateReleased)
  versionNames <- versions[[1]]
  firstMatch <- vapply(
    as.numeric(publishedAt),
    # which() ignores NA comparisons; `[1]` gives NA when nothing matches.
    function(published) which(published >= released)[1],
    integer(1)
  )
  versionNames[firstMatch]
}

numVersions <- nrow(rVersions)
numBeginnerVideos <- nrow(beginnerVideosUnique)
beginnerVideosUnique$rVersion <- findRVersion(
  beginnerVideosUnique$publishedAt,
  rVersions
)
numIntermediateVideos <- nrow(intermediateVideosUnique)
intermediateVideosUnique$rVersion <- findRVersion(
  intermediateVideosUnique$publishedAt,
  rVersions
)
numAdvancedVideos <- nrow(advancedVideosUnique)
advancedVideosUnique$rVersion <- findRVersion(
  advancedVideosUnique$publishedAt,
  rVersions
)
# Combine the three per-level tables into one (union() also drops rows that
# are exact duplicates), then write each table to its own CSV.
uniqueAll <- beginnerVideosUnique %>%
  union(intermediateVideosUnique) %>%
  union(advancedVideosUnique)
write_csv(
  beginnerVideosUnique,
  paste0(directory, "Beginner R Programming Unique.csv")
)
write_csv(
  intermediateVideosUnique,
  paste0(directory, "Intermediate R Programming Unique.csv")
)
write_csv(
  advancedVideosUnique,
  paste0(directory, "Advanced R Programming Unique.csv")
)
write_csv(uniqueAll, paste0(directory, "R Programming Unique.csv"))
library(readr)
library(ggplot2)
library(runner)
library(stringr)
library("stopwords")
library("wordcloud")
library(ggrepel)
library(psych)
# Load the combined, deduplicated video table for analysis.
directory <- "data/"
rUniqueFilePath <- paste0(directory, "R Programming Unique.csv")
rVideosUnique <- read_delim(file = rUniqueFilePath, trim_ws = TRUE)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 278 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): video_id, channelId, title, description, thumbnails.default.url, c...
## dbl (5): viewCount, likeCount, dislikeCount, favoriteCount, commentCount
## lgl (1): relevant
## dttm (1): publishedAt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Videos flagged as relevant, across all levels.
rVideosUniqueRelevant <- filter(rVideosUnique, relevant)
# "Misinformation"
# The per-level tables below are split by `level` only, so despite their names
# they keep both relevant and non-relevant rows.
# 66/92 = 71%
beginnerVideosUniqueRelevant <- filter(rVideosUnique, level == "beginner")
# 71/95 = 75%
intermediateVideosUniqueRelevant <- filter(
  rVideosUnique,
  level == "intermediate"
)
# 90/91 = 99%
advancedVideosUniqueRelevant <- filter(rVideosUnique, level == "advanced")
# When did the market saturate?
# Views over time per level, coloured by whether the video was flagged relevant.
beginnerVideosUniqueRelevant %>%
  ggplot(aes(x = publishedAt, y = viewCount)) +
  geom_point(aes(colour = factor(relevant))) +
  labs(title = "Beginner Relevancy by Date")

intermediateVideosUniqueRelevant %>%
  ggplot(aes(x = publishedAt, y = viewCount)) +
  geom_point(aes(colour = factor(relevant))) +
  labs(title = "Intermediate Relevancy by Date")

advancedVideosUniqueRelevant %>%
  ggplot(aes(x = publishedAt, y = viewCount)) +
  geom_point(aes(colour = factor(relevant))) +
  labs(title = "Advanced Relevancy by Date")

# Assuming most of a video's traffic occurs within its first couple of months,
# a roughly linear cumulative curve suggests viewership has grown at a steady
# rate.
# The number of older videos may also point to a scarcity of newer ones:
# results are sorted by relevancy, and one might expect newer videos to be
# more relevant.
rVideosUniqueRelevant %>%
  arrange(publishedAt) %>%
  ggplot(aes(
    x = publishedAt,
    y = sum_run(x = viewCount, idx = publishedAt)
  )) +
  geom_smooth(method = lm) +
  geom_point() +
  labs(title = "Cumulative Views by Date")
## `geom_smooth()` using formula 'y ~ x'

# Views over time for relevant videos, coloured by level.
rVideosUniqueRelevant %>%
  ggplot(aes(x = publishedAt, y = viewCount)) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by Date")

# Same plot restricted to videos with fewer than 1000 views.
rVideosUniqueRelevant %>%
  filter(viewCount < 1000) %>%
  ggplot(aes(x = publishedAt, y = viewCount)) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by Date")

# Views grouped by the R version assigned to each video; x labels are tilted.
rVideosUniqueRelevant %>%
  ggplot(aes(x = rVersion, y = viewCount)) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by R version") +
  theme(axis.text.x = element_text(angle = 45))

# Descriptive statistics of view counts for all relevant videos.
describe(rVideosUniqueRelevant$viewCount)
## vars n mean sd median trimmed mad min max range skew
## X1 1 227 36420.6 154786.2 196 5635.71 268.35 0 1814387 1814387 8.38
## kurtosis se
## X1 83.93 10273.52
# Column-by-column summary of the same table.
summary(rVideosUniqueRelevant)
## video_id publishedAt channelId
## Length:227 Min. :2010-02-07 00:00:00 Length:227
## Class :character 1st Qu.:2019-06-01 00:00:00 Class :character
## Mode :character Median :2020-09-17 00:00:00 Mode :character
## Mean :2019-08-22 20:05:17
## 3rd Qu.:2021-01-22 00:00:00
## Max. :2021-10-30 00:00:00
## title description thumbnails.default.url
## Length:227 Length:227 Length:227
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:227 Min. : 0 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 37 1st Qu.: 1.0 1st Qu.: 0.00
## Mode :character Median : 196 Median : 4.0 Median : 0.00
## Mean : 36421 Mean : 649.5 Mean : 10.65
## 3rd Qu.: 4324 3rd Qu.: 64.5 3rd Qu.: 1.50
## Max. :1814387 Max. :41152.0 Max. :513.00
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.00 Length:227 Length:227
## 1st Qu.:0 1st Qu.: 0.00 Class :character Class :character
## Median :0 Median : 0.00 Mode :character Mode :character
## Mean :0 Mean : 34.89
## 3rd Qu.:0 3rd Qu.: 4.50
## Max. :0 Max. :886.00
## rVersion relevant
## Length:227 Mode:logical
## Class :character TRUE:227
## Mode :character
##
##
##
# Descriptive statistics of view counts for the beginner-level table.
describe(beginnerVideosUniqueRelevant$viewCount)
## vars n mean sd median trimmed mad min max range skew
## X1 1 92 465864.9 1874922 2635.5 101214.3 3905.17 0 13934381 13934381 6.09
## kurtosis se
## X1 37.76 195474.1
# Column-by-column summary of the same table.
summary(beginnerVideosUniqueRelevant)
## video_id publishedAt channelId
## Length:92 Min. :2013-10-31 00:00:00 Length:92
## Class :character 1st Qu.:2019-02-21 12:00:00 Class :character
## Mode :character Median :2020-04-16 12:00:00 Mode :character
## Mean :2019-09-13 10:26:05
## 3rd Qu.:2020-09-22 06:00:00
## Max. :2021-10-30 00:00:00
## title description thumbnails.default.url
## Length:92 Length:92 Length:92
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:92 Min. : 0 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 88 1st Qu.: 1.0 1st Qu.: 0.00
## Mode :character Median : 2636 Median : 45.5 Median : 1.00
## Mean : 465865 Mean : 10781.2 Mean : 186.73
## 3rd Qu.: 163410 3rd Qu.: 3679.8 3rd Qu.: 62.25
## Max. :13934381 Max. :336767.0 Max. :4412.00
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.0 Length:92 Length:92
## 1st Qu.:0 1st Qu.: 0.0 Class :character Class :character
## Median :0 Median : 3.5 Mode :character Mode :character
## Mean :0 Mean : 457.0
## 3rd Qu.:0 3rd Qu.: 222.2
## Max. :0 Max. :15001.0
## rVersion relevant
## Length:92 Mode :logical
## Class :character FALSE:25
## Mode :character TRUE :66
## NA's :1
##
##
# Descriptive statistics of view counts for the intermediate-level table.
describe(intermediateVideosUniqueRelevant$viewCount)
## vars n mean sd median trimmed mad min max range skew
## X1 1 95 220691.8 739024 619 35749.96 911.8 0 4617704 4617704 4.41
## kurtosis se
## X1 20.27 75822.26
# Column-by-column summary of the same table.
summary(intermediateVideosUniqueRelevant)
## video_id publishedAt channelId
## Length:95 Min. :2010-02-07 00:00:00 Length:95
## Class :character 1st Qu.:2017-11-13 12:00:00 Class :character
## Mode :character Median :2020-05-05 00:00:00 Mode :character
## Mean :2018-09-25 20:27:47
## 3rd Qu.:2021-01-09 12:00:00
## Max. :2021-12-02 00:00:00
## title description thumbnails.default.url
## Length:95 Length:95 Length:95
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:95 Min. : 0 Min. : 0 Min. : 0.00
## Class :character 1st Qu.: 62 1st Qu.: 1 1st Qu.: 0.00
## Mode :character Median : 619 Median : 6 Median : 0.00
## Mean : 220692 Mean : 3954 Mean : 71.79
## 3rd Qu.: 75637 3rd Qu.: 567 3rd Qu.: 6.50
## Max. :4617704 Max. :87535 Max. :1679.00
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.0 Length:95 Length:95
## 1st Qu.:0 1st Qu.: 0.0 Class :character Class :character
## Median :0 Median : 0.0 Mode :character Mode :character
## Mean :0 Mean : 108.4
## 3rd Qu.:0 3rd Qu.: 52.5
## Max. :0 Max. :2227.0
## rVersion relevant
## Length:95 Mode :logical
## Class :character FALSE:19
## Mode :character TRUE :71
## NA's :5
##
##
# Descriptive statistics of view counts for the advanced-level table.
describe(advancedVideosUniqueRelevant$viewCount)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 91 4823.18 13816.58 89 1321.68 118.61 0 86035 86035 4.1 17.89
## se
## X1 1448.37
# Column-by-column summary of the same table.
summary(advancedVideosUniqueRelevant)
## video_id publishedAt channelId
## Length:91 Min. :2015-11-25 00:00:00 Length:91
## Class :character 1st Qu.:2020-07-12 00:00:00 Class :character
## Mode :character Median :2020-11-05 00:00:00 Mode :character
## Mean :2020-06-26 00:15:49
## 3rd Qu.:2021-03-07 00:00:00
## Max. :2021-10-28 00:00:00
## title description thumbnails.default.url
## Length:91 Length:91 Length:91
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:91 Min. : 0 Min. : 0.00 Min. : 0.000
## Class :character 1st Qu.: 31 1st Qu.: 0.00 1st Qu.: 0.000
## Mode :character Median : 89 Median : 2.00 Median : 0.000
## Mean : 4823 Mean : 92.35 Mean : 1.604
## 3rd Qu.: 1263 3rd Qu.: 17.50 3rd Qu.: 0.000
## Max. :86035 Max. :1426.00 Max. :22.000
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.000 Length:91 Length:91
## 1st Qu.:0 1st Qu.: 0.000 Class :character Class :character
## Median :0 Median : 0.000 Mode :character Mode :character
## Mean :0 Mean : 6.681
## 3rd Qu.:0 3rd Qu.: 2.000
## Max. :0 Max. :188.000
## rVersion relevant
## Length:91 Mode :logical
## Class :character FALSE:1
## Mode :character TRUE :90
##
##
##
# Distribution of each engagement metric per difficulty level, relevant
# videos only.
rVideosUniqueRelevant %>%
  ggplot(aes(x = level, y = viewCount)) +
  geom_boxplot() +
  labs(title = "Views by Difficulty")

rVideosUniqueRelevant %>%
  ggplot(aes(x = level, y = likeCount)) +
  geom_boxplot() +
  labs(title = "Likes by Difficulty")

rVideosUniqueRelevant %>%
  ggplot(aes(x = level, y = dislikeCount)) +
  geom_boxplot() +
  labs(title = "Dislikes by Difficulty")

rVideosUniqueRelevant %>%
  ggplot(aes(x = level, y = commentCount)) +
  geom_boxplot() +
  labs(title = "Comments by Difficulty")

# One row per channel: number of videos (`n`) and their total views.
summariseChannels <- function(videos) {
  perChannel <- group_by(videos, channelTitle)
  summarise(perChannel, n = n(), viewCount = sum(viewCount))
}

# Scatter plot of video count against total views, one labelled point per
# channel. Returns the plot so that a top-level call prints it.
plotChannels <- function(channels, title) {
  channelLabels <- geom_label_repel(
    aes(label = channelTitle),
    box.padding = 0.35,
    point.padding = 0.5,
    segment.color = "grey50"
  )
  ggplot(channels, aes(n, viewCount)) +
    geom_point() +
    ggtitle(title) +
    channelLabels
}

rVideosUniqueRelevantChannels <- summariseChannels(rVideosUniqueRelevant)
plotChannels(rVideosUniqueRelevantChannels, "All Channel Videos Count by Views")
## Warning: ggrepel: 91 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

beginnerVideosUniqueRelevantChannels <- summariseChannels(
  beginnerVideosUniqueRelevant
)
plotChannels(
  beginnerVideosUniqueRelevantChannels,
  "Beginner Channel Video Count by Views"
)
## Warning: ggrepel: 54 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

intermediateVideosUniqueRelevantChannels <- summariseChannels(
  intermediateVideosUniqueRelevant
)
plotChannels(
  intermediateVideosUniqueRelevantChannels,
  "Intermediate Channel Video Count by Views"
)
## Warning: ggrepel: 33 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

advancedVideosUniqueRelevantChannels <- summariseChannels(
  advancedVideosUniqueRelevant
)
plotChannels(
  advancedVideosUniqueRelevantChannels,
  "Advanced Channel Video Count by Views"
)
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Upper bound on the number of words drawn in each cloud.
maxWords <- 100
# Tokens to discard: anything ending in two or more digits, empty or
# all-digit tokens, and anything containing a period.
filterRegex <- ".*[0-9][0-9]+$|^[0-9]*$|.*\\..*"

# Tokenise a one-column table of text into words, drop stop words and tokens
# matching `filterRegex`, and count the occurrences of each remaining word.
# Returns a table with columns `word` and `n`.
countWords <- function(textColumn) {
  # A fixed column name lets one helper serve title, description and tags;
  # the input column is not part of the result.
  names(textColumn) <- "text"
  tokens <- unnest_tokens(textColumn, output = word, input = text)
  tokens <- anti_join(tokens, get_stopwords())
  tokens <- filter(tokens, !str_detect(word, regex(filterRegex)))
  count(tokens, word)
}

# Draw a word cloud from a `word`/`n` table, capped at `maxWords` words.
drawWordCloud <- function(wordCounts) {
  wordcloud(
    words = wordCounts$word,
    freq = wordCounts$n,
    max.words = maxWords
  )
}

# Titles (column 4): relevant videos overall, then each level.
titleWords <- countWords(rVideosUniqueRelevant[, 4])
## Joining, by = "word"
drawWordCloud(titleWords)

titleWords <- countWords(beginnerVideosUniqueRelevant[, 4])
## Joining, by = "word"
drawWordCloud(titleWords)

titleWords <- countWords(intermediateVideosUniqueRelevant[, 4])
## Joining, by = "word"
drawWordCloud(titleWords)

titleWords <- countWords(advancedVideosUniqueRelevant[, 4])
## Joining, by = "word"
drawWordCloud(titleWords)

# Descriptions (column 5).
descriptionWords <- countWords(rVideosUniqueRelevant[, 5])
## Joining, by = "word"
drawWordCloud(descriptionWords)

descriptionWords <- countWords(beginnerVideosUniqueRelevant[, 5])
## Joining, by = "word"
drawWordCloud(descriptionWords)

descriptionWords <- countWords(intermediateVideosUniqueRelevant[, 5])
## Joining, by = "word"
drawWordCloud(descriptionWords)

descriptionWords <- countWords(advancedVideosUniqueRelevant[, 5])
## Joining, by = "word"
drawWordCloud(descriptionWords)

# Tags (column 13): commas are replaced with spaces in the `tags` column of
# each table before tokenising.
rVideosUniqueRelevant$tags <- gsub(",", " ", rVideosUniqueRelevant$tags)
tagsWords <- countWords(rVideosUniqueRelevant[, 13])
## Joining, by = "word"
drawWordCloud(tagsWords)

beginnerVideosUniqueRelevant$tags <- gsub(
  ",", " ", beginnerVideosUniqueRelevant$tags
)
tagsWords <- countWords(beginnerVideosUniqueRelevant[, 13])
## Joining, by = "word"
drawWordCloud(tagsWords)

intermediateVideosUniqueRelevant$tags <- gsub(
  ",", " ", intermediateVideosUniqueRelevant$tags
)
tagsWords <- countWords(intermediateVideosUniqueRelevant[, 13])
## Joining, by = "word"
drawWordCloud(tagsWords)

advancedVideosUniqueRelevant$tags <- gsub(
  ",", " ", advancedVideosUniqueRelevant$tags
)
tagsWords <- countWords(advancedVideosUniqueRelevant[, 13])
## Joining, by = "word"
drawWordCloud(tagsWords)
